Compare Hospital Clinic’s data with other datasets

Loading libraries:

#install.packages("readr")
library("readr")
#install.packages('data.table')
library(data.table)
library(stringr)
#install.packages('ggplot2')
library(ggplot2)
library(gridExtra)
library(grid)
library(plyr)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
## 
##     compact
## The following object is masked from 'package:data.table':
## 
##     transpose
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages('flextable')
library(flextable)
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
## 
##     compose
library(tibble)
library(ComplexHeatmap)
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(RColorBrewer)
library(ggplotify)
library(here)
## here() starts at /home/natasa/Desktop/master_project/Master-Project-Natasa-Mortvanski
## 
## Attaching package: 'here'
## The following object is masked from 'package:plyr':
## 
##     here
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ tidyr     1.3.0
## ✔ lubridate 1.9.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()     masks plyr::arrange()
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::combine()     masks gridExtra::combine()
## ✖ purrr::compact()     masks plyr::compact()
## ✖ flextable::compose() masks purrr::compose()
## ✖ dplyr::count()       masks plyr::count()
## ✖ dplyr::desc()        masks plyr::desc()
## ✖ dplyr::failwith()    masks plyr::failwith()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ dplyr::id()          masks plyr::id()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ dplyr::mutate()      masks plyr::mutate()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ dplyr::rename()      masks plyr::rename()
## ✖ lubridate::second()  masks data.table::second()
## ✖ dplyr::summarise()   masks plyr::summarise()
## ✖ dplyr::summarize()   masks plyr::summarize()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load data:

# Load healthy samples' table
all_healthy <- read.csv(here("01_tidy_data", "AGP_healthy.csv.gz"), header = TRUE, sep = ",")

all_healthy <- dplyr::select(all_healthy, sample_id, shannon_entropy, chao1, menhinick, margalef, fisher_alpha, simpson, pielou_evenness, gini_index, strong, simpson, faith_pd, sex, race, host_age, condition)

names(all_healthy)[names(all_healthy) == 'host_age'] <- 'age'

#Load *CDI data from NCBI* data
CDI <- read.csv(here("01_tidy_data", "ncbi_CDI.csv.gz"), header = TRUE, sep = ",")

# Load CDI and donor data from Hospital Clínic
hospital_CDI <- read.csv(here("01_tidy_data", "hosp_CDI.csv.gz"), header = TRUE, sep = ",")
hospital_donor <- read.csv(here("01_tidy_data", "hosp_donor.csv.gz"), header = TRUE, sep = ",")

Let’s define vector of names of the alpha diversity metrics that are going to be analysed:

metric <- c("chao1", "margalef", "menhinick", "fisher_alpha", "faith_pd", "gini_index", "strong", "pielou_evenness", "shannon_entropy", "simpson") 

Let’s define function for plotting violin plots that we are going to use in the whole analysis:

plot_violin <- function(df, column){
  violin <- vector('list', length(metric))
  
  for (i in 1:length(metric)){
    mean_line <- df %>% dplyr::group_by(.data[[column]]) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
    plot_data <- df %>% dplyr::group_by(.data[[column]]) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 
  
    violin[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(.data[[column]], -m), color = .data[[column]], fill = .data[[column]])) +
      geom_violin()+
      geom_boxplot(width=0.1, color="grey", alpha=0.2) +
      geom_vline(data = mean_line, aes(xintercept = grp_mean, color = .data[[column]]), linetype = "dashed")+ 
      labs(x = metric[i])+
      ylab("")+
      theme(legend.position="none") 
    
    if(metric[i] != "shannon_entropy" & metric[i] != "strong" & metric[i] != "gini_index"  & metric[i] != "menhinick"){
     violin[[i]] <- violin[[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
    }
  }
  return(violin)
}

Function for doing Mann-Whitney-Wilcoxon test:

do_wilcox_test <- function(df, column){
  test <- list()
  
  for (i in 1:length(metric)){
    test[[i]] <- pairwise.wilcox.test(df[[metric[i]]], df[[column]], p.adjust.method="none") %>% 
    broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
    test[[i]]$p.value <- round(test[[i]]$p.value, digits = 5)
  }
  
  tests <- do.call(what = rbind, args = test)
  
  return(tests)
}

First, lets examine difference in alpha diversity before and after FMT in CDI data set:

hospital_CDI$FMT_pre_post <- as.factor(hospital_CDI$FMT_pre_post)
hospital_CDI$FMT_pre_post <- ordered(hospital_CDI$FMT_pre_post, levels = c("pre", "post"))
table1(~ chao1 + margalef + menhinick + fisher_alpha + faith_pd + gini_index + strong + pielou_evenness + shannon_entropy + simpson | FMT_pre_post, data=hospital_CDI)
pre
(N=18)
post
(N=38)
Overall
(N=56)
chao1
Mean (SD) 89.3 (36.1) 133 (61.3) 119 (57.9)
Median [Min, Max] 88.1 [29.0, 159] 126 [41.0, 298] 109 [29.0, 298]
margalef
Mean (SD) 9.75 (3.92) 14.1 (5.99) 12.7 (5.75)
Median [Min, Max] 9.52 [3.25, 17.6] 13.9 [4.64, 28.2] 11.6 [3.25, 28.2]
menhinick
Mean (SD) 1.15 (0.455) 1.65 (0.695) 1.49 (0.667)
Median [Min, Max] 1.12 [0.391, 2.06] 1.63 [0.553, 3.29] 1.36 [0.391, 3.29]
fisher_alpha
Mean (SD) 14.6 (6.84) 22.8 (11.7) 20.1 (11.0)
Median [Min, Max] 13.9 [4.01, 29.2] 21.9 [6.01, 52.3] 17.6 [4.01, 52.3]
faith_pd
Mean (SD) 13.0 (3.84) 15.1 (4.82) 14.4 (4.61)
Median [Min, Max] 12.3 [5.45, 19.8] 14.3 [7.15, 27.0] 14.2 [5.45, 27.0]
gini_index
Mean (SD) 0.997 (0.00156) 0.996 (0.00200) 0.996 (0.00197)
Median [Min, Max] 0.997 [0.994, 0.999] 0.996 [0.991, 0.999] 0.996 [0.991, 0.999]
strong
Mean (SD) 0.496 (0.0806) 0.470 (0.0521) 0.478 (0.0632)
Median [Min, Max] 0.490 [0.355, 0.660] 0.473 [0.351, 0.615] 0.476 [0.351, 0.660]
pielou_evenness
Mean (SD) 0.815 (0.0685) 0.848 (0.0410) 0.837 (0.0531)
Median [Min, Max] 0.829 [0.669, 0.911] 0.857 [0.698, 0.913] 0.846 [0.669, 0.913]
shannon_entropy
Mean (SD) 5.14 (0.828) 5.78 (0.708) 5.58 (0.800)
Median [Min, Max] 5.23 [3.92, 6.41] 5.86 [4.02, 7.04] 5.68 [3.92, 7.04]
simpson
Mean (SD) 0.951 (0.0285) 0.969 (0.0175) 0.963 (0.0231)
Median [Min, Max] 0.959 [0.892, 0.986] 0.975 [0.902, 0.990] 0.971 [0.892, 0.990]
violin_hospital_2 <- vector('list', length(metric))

violin_hospital_2 <- plot_violin(hospital_CDI, "FMT_pre_post")

grid.arrange(violin_hospital_2[[1]], violin_hospital_2[[2]], violin_hospital_2[[3]], violin_hospital_2[[4]], violin_hospital_2[[5]], violin_hospital_2[[6]], violin_hospital_2[[7]], violin_hospital_2[[8]], violin_hospital_2[[9]], violin_hospital_2[[10]], ncol=3) 

Lets see how alpha diversity of CDI data before FMT diverges from stool donor data from Hospital Clínic:

hospital_CDI_pre_FMT <- hospital_CDI %>%
  filter(FMT_pre_post == "pre")

compare_hospital <- rbind.fill(hospital_donor, hospital_CDI_pre_FMT)

compare_hospital$condition <- as.factor(compare_hospital$condition)

# Sizes of each dataset
table(compare_hospital$condition)
## 
##       Cdif_pre healthy_donors 
##             18            113
table1(~ chao1 + margalef + menhinick + fisher_alpha + faith_pd + gini_index + strong + pielou_evenness + shannon_entropy + simpson | condition, data=compare_hospital)
Cdif_pre
(N=18)
healthy_donors
(N=113)
Overall
(N=131)
chao1
Mean (SD) 89.3 (36.1) 191 (41.1) 177 (53.5)
Median [Min, Max] 88.1 [29.0, 159] 189 [103, 300] 180 [29.0, 300]
margalef
Mean (SD) 9.75 (3.92) 19.5 (4.07) 18.1 (5.25)
Median [Min, Max] 9.52 [3.25, 17.6] 19.4 [10.6, 29.6] 18.4 [3.25, 29.6]
menhinick
Mean (SD) 1.15 (0.455) 1.54 (0.320) 1.48 (0.365)
Median [Min, Max] 1.12 [0.391, 2.06] 1.54 [0.841, 2.34] 1.50 [0.391, 2.34]
fisher_alpha
Mean (SD) 14.6 (6.84) 30.5 (7.55) 28.3 (9.25)
Median [Min, Max] 13.9 [4.01, 29.2] 30.3 [14.9, 50.1] 29.0 [4.01, 50.1]
faith_pd
Mean (SD) 13.0 (3.84) 21.7 (3.96) 20.5 (4.95)
Median [Min, Max] 12.3 [5.45, 19.8] 20.8 [16.4, 42.0] 20.5 [5.45, 42.0]
gini_index
Mean (SD) 0.997 (0.00156) 0.980 (0.00382) 0.983 (0.00682)
Median [Min, Max] 0.997 [0.994, 0.999] 0.981 [0.970, 0.989] 0.981 [0.970, 0.999]
strong
Mean (SD) 0.496 (0.0806) 0.388 (0.0306) 0.403 (0.0553)
Median [Min, Max] 0.490 [0.355, 0.660] 0.387 [0.319, 0.453] 0.395 [0.319, 0.660]
pielou_evenness
Mean (SD) 0.815 (0.0685) 0.904 (0.0198) 0.891 (0.0435)
Median [Min, Max] 0.829 [0.669, 0.911] 0.907 [0.852, 0.936] 0.903 [0.669, 0.936]
shannon_entropy
Mean (SD) 5.14 (0.828) 6.80 (0.290) 6.57 (0.699)
Median [Min, Max] 5.23 [3.92, 6.41] 6.77 [5.87, 7.46] 6.73 [3.92, 7.46]
simpson
Mean (SD) 0.951 (0.0285) 0.987 (0.00389) 0.982 (0.0165)
Median [Min, Max] 0.959 [0.892, 0.986] 0.987 [0.970, 0.992] 0.986 [0.892, 0.992]
violin_compare <- vector('list', length(metric))

violin_compare <- plot_violin(compare_hospital, "condition")

#violin_compare

grid.arrange(violin_compare[[1]], violin_compare[[2]], violin_compare[[3]], violin_compare[[4]], violin_compare[[5]], violin_compare[[6]], violin_compare[[7]], violin_compare[[8]], violin_compare[[9]], violin_compare[[10]], ncol=3) 

test_CDI_healthy <- list()

test_CDI_healthy <- do_wilcox_test(compare_hospital, "condition")

test_CDI_healthy_1 <- test_CDI_healthy %>% 
  add_column(p.adjusted = round(p.adjust(test_CDI_healthy$p.value, "fdr"), digits=5), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Mann-Whitney-Wilcoxon test for distributions of Hospital Clínic's CDI samples vs healthy control")

test_CDI_healthy_1

Results of the Mann-Whitney-Wilcoxon test for distributions of Hospital Clínic's CDI samples vs healthy control

parameter

group1

group2

p.value

p.adjusted

chao1

healthy_donors

Cdif_pre

0.00000

0.00000

faith_pd

healthy_donors

Cdif_pre

0.00000

0.00000

fisher_alpha

healthy_donors

Cdif_pre

0.00000

0.00000

gini_index

healthy_donors

Cdif_pre

0.00000

0.00000

margalef

healthy_donors

Cdif_pre

0.00000

0.00000

pielou_evenness

healthy_donors

Cdif_pre

0.00000

0.00000

shannon_entropy

healthy_donors

Cdif_pre

0.00000

0.00000

simpson

healthy_donors

Cdif_pre

0.00000

0.00000

strong

healthy_donors

Cdif_pre

0.00000

0.00000

menhinick

healthy_donors

Cdif_pre

0.00049

0.00049

library(ROSE)
## Loaded ROSE 0.0-4
# Let's make training and testing subset of data
#make this example reproducible
set.seed(1)

#use 70% of dataset as training set and 30% as test set
sample <- sample(c(TRUE, FALSE), nrow(compare_hospital), replace=TRUE, prob=c(0.7,0.3))
train  <- compare_hospital[sample, ]
test   <- compare_hospital[!sample, ]

table(train$condition)
## 
##       Cdif_pre healthy_donors 
##             15             77
table(test$condition)
## 
##       Cdif_pre healthy_donors 
##              3             36
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
richness <- c("chao1", "margalef", "menhinick", "fisher_alpha", "faith_pd") 
evenness <- c("gini_index", "strong", "pielou_evenness", "shannon_entropy", "simpson") 
results_accuracy_CDI <- data.frame(model = character(0), accuracy = numeric(0))

model_all <- randomForest(condition ~ shannon_entropy + chao1 + menhinick + margalef + fisher_alpha + simpson + gini_index + strong + pielou_evenness + faith_pd, data = train, importance=TRUE)

prediction_CDI_all <- predict(model_all, test)
confusion_matrix <- confusionMatrix(prediction_CDI_all, test$condition)
accuracy_CDI_all <- confusion_matrix$overall["Accuracy"]

for (a in richness){
  for (b in evenness){
    formula <- as.formula(sprintf("%s ~ %s + %s", "condition", a, b))
    model <- randomForest(formula, data = train, importance=TRUE) 
    
    # Calculating accuracy
    prediction <- predict(model, test)
    confusion_matrix <- confusionMatrix(prediction, test$condition)
    accuracy <- confusion_matrix$overall["Accuracy"]
    
    new_row <- c(model = sprintf("%s + %s", a, b), accuracy = accuracy)
    results_accuracy_CDI <- rbind(results_accuracy_CDI, new_row)
  }
}

names(results_accuracy_CDI)[1] <- 'model'
names(results_accuracy_CDI)[2] <- 'accuracy'

results_accuracy_CDI[nrow(results_accuracy_CDI)+1,] <- c("all alpha metrics", accuracy_CDI_all)

results_accuracy_CDI$accuracy <- as.numeric(results_accuracy_CDI$accuracy)

results_accuracy_CDI <- results_accuracy_CDI[order(-results_accuracy_CDI$accuracy),]

results_accuracy_CDI[1:10,] %>% 
  flextable() %>% 
  add_header_lines(values = "Accuracy of random forest classifier trained on CDI and healthy datasets in differnet models")

Accuracy of random forest classifier trained on CDI and healthy datasets in differnet models

model

accuracy

chao1 + gini_index

1.000000

margalef + gini_index

1.000000

menhinick + gini_index

1.000000

menhinick + strong

1.000000

menhinick + shannon_entropy

1.000000

fisher_alpha + gini_index

1.000000

faith_pd + gini_index

1.000000

all alpha metrics

1.000000

chao1 + strong

0.974359

menhinick + pielou_evenness

0.974359

Modified t-test

set.seed(1)

#use 70% of dataset as healthy population and 30% as healthy samples to test 
sample <- sample(c(TRUE, FALSE), nrow(hospital_donor), replace=TRUE, prob=c(0.7,0.3))
healthy_popul  <- hospital_donor[sample, ]
healthy_test   <- hospital_donor[!sample, ]

nrow(healthy_popul)
## [1] 77
nrow(healthy_test)
## [1] 36
# samples to test 
test_samples <- rbind.fill(healthy_test, hospital_CDI)

#test_samples <- rbind.fill(healthy_popul, CDI)


nrow(test_samples)
## [1] 92
library(table1)

CrawfordHowell <- function(case, control){
  tval <- (case - mean(control)) / (sd(control)*sqrt((length(control)+1) / length(control)))
  degfree <- length(control)-1
  pval <- 2*(1-pt(abs(tval), df=degfree)) #two-tailed p-value
  result <- data.frame(t = tval, df = degfree, p=pval)
  return(result)
}

#table_list <- list()

for (i in 1:length(metric)){

    t_statistics <- list()
    t_prob <- list()
    result <- data.frame()
    
    for (n in 1:nrow(test_samples)){
      result <- CrawfordHowell(test_samples[[metric[i]]][n], healthy_popul[[metric[i]]]) 
      t_statistics <- append(t_statistics, result[1])
      t_prob <- append(t_prob, result[3])
    }
    
    t_test_results <- test_samples[, c("sample_id", "condition")]
    t_test_results$metric <- test_samples[[metric[i]]]
    t_test_results$t_statistic <- unlist(t_statistics)
    t_test_results$t_probability <- unlist(t_prob)
    
    #table_list[[i]] <- t_test_results
    
    print(table1(~ metric + t_probability | condition, data=t_test_results, caption = paste("Results for alpha metric:", metric[i], " ")))
}
## <table class="Rtable1"><caption>Results for alpha metric: chao1  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>133 (61.3)</td>
## <td>89.3 (36.1)</td>
## <td>191 (44.5)</td>
## <td>147 (63.5)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>126 [41.0, 298]</td>
## <td class='lastrow'>88.1 [29.0, 159]</td>
## <td class='lastrow'>189 [117, 300]</td>
## <td class='lastrow'>147 [29.0, 300]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.228 (0.299)</td>
## <td>0.0589 (0.110)</td>
## <td>0.449 (0.278)</td>
## <td>0.282 (0.301)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0838 [0.000341, 0.978]</td>
## <td class='lastrow'>0.0120 [0.000122, 0.420]</td>
## <td class='lastrow'>0.449 [0.00834, 0.992]</td>
## <td class='lastrow'>0.155 [0.000122, 0.992]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: margalef  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>14.1 (5.99)</td>
## <td>9.75 (3.92)</td>
## <td>19.5 (4.44)</td>
## <td>15.4 (6.22)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>13.9 [4.64, 28.2]</td>
## <td class='lastrow'>9.52 [3.25, 17.6]</td>
## <td class='lastrow'>19.4 [11.9, 29.6]</td>
## <td class='lastrow'>15.4 [3.25, 29.6]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.244 (0.309)</td>
## <td>0.0830 (0.165)</td>
## <td>0.438 (0.278)</td>
## <td>0.288 (0.303)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0647 [0.000335, 0.960]</td>
## <td class='lastrow'>0.0141 [0.0000989, 0.645]</td>
## <td class='lastrow'>0.436 [0.0119, 0.963]</td>
## <td class='lastrow'>0.182 [0.0000989, 0.963]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: menhinick  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>1.65 (0.695)</td>
## <td>1.15 (0.455)</td>
## <td>1.54 (0.349)</td>
## <td>1.51 (0.563)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>1.63 [0.553, 3.29]</td>
## <td class='lastrow'>1.12 [0.391, 2.06]</td>
## <td class='lastrow'>1.53 [0.939, 2.34]</td>
## <td class='lastrow'>1.44 [0.391, 3.29]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.223 (0.240)</td>
## <td>0.301 (0.342)</td>
## <td>0.438 (0.278)</td>
## <td>0.323 (0.290)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.113 [0.000000258, 0.896]</td>
## <td class='lastrow'>0.119 [0.000408, 0.931]</td>
## <td class='lastrow'>0.436 [0.0119, 0.963]</td>
## <td class='lastrow'>0.271 [0.000000258, 0.963]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: fisher_alpha  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>22.8 (11.7)</td>
## <td>14.6 (6.84)</td>
## <td>30.6 (8.25)</td>
## <td>24.2 (11.2)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>21.9 [6.01, 52.3]</td>
## <td class='lastrow'>13.9 [4.01, 29.2]</td>
## <td class='lastrow'>30.2 [16.9, 50.1]</td>
## <td class='lastrow'>23.7 [4.01, 52.3]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.277 (0.333)</td>
## <td>0.119 (0.220)</td>
## <td>0.440 (0.279)</td>
## <td>0.310 (0.314)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0671 [0.00125, 0.990]</td>
## <td class='lastrow'>0.0261 [0.000520, 0.857]</td>
## <td class='lastrow'>0.440 [0.00883, 0.981]</td>
## <td class='lastrow'>0.178 [0.000520, 0.990]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: faith_pd  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>15.1 (4.82)</td>
## <td>13.0 (3.84)</td>
## <td>22.6 (5.91)</td>
## <td>17.6 (6.50)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>14.3 [7.15, 27.0]</td>
## <td class='lastrow'>12.3 [5.45, 19.8]</td>
## <td class='lastrow'>20.8 [16.4, 42.0]</td>
## <td class='lastrow'>17.7 [5.45, 42.0]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.161 (0.292)</td>
## <td>0.0657 (0.150)</td>
## <td>0.419 (0.310)</td>
## <td>0.243 (0.311)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.00769 [0.000000450, 0.944]</td>
## <td class='lastrow'>0.000811 [0.0000000286, 0.558]</td>
## <td class='lastrow'>0.406 [0.00000000000715, 0.998]</td>
## <td class='lastrow'>0.0591 [0.00000000000715, 0.998]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: gini_index  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.996 (0.00200)</td>
## <td>0.997 (0.00156)</td>
## <td>0.980 (0.00416)</td>
## <td>0.990 (0.00847)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.996 [0.991, 0.999]</td>
## <td class='lastrow'>0.997 [0.994, 0.999]</td>
## <td class='lastrow'>0.980 [0.971, 0.988]</td>
## <td class='lastrow'>0.995 [0.971, 0.999]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.000582 (0.00145)</td>
## <td>0.0000744 (0.000129)</td>
## <td>0.489 (0.322)</td>
## <td>0.191 (0.312)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0000756 [0.00000400, 0.00568]</td>
## <td class='lastrow'>0.0000186 [0.00000400, 0.000439]</td>
## <td class='lastrow'>0.516 [0.0156, 0.974]</td>
## <td class='lastrow'>0.000250 [0.00000400, 0.974]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: strong  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.470 (0.0521)</td>
## <td>0.496 (0.0806)</td>
## <td>0.384 (0.0270)</td>
## <td>0.441 (0.0695)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.473 [0.351, 0.615]</td>
## <td class='lastrow'>0.490 [0.355, 0.660]</td>
## <td class='lastrow'>0.386 [0.331, 0.452]</td>
## <td class='lastrow'>0.427 [0.331, 0.660]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.127 (0.217)</td>
## <td>0.116 (0.203)</td>
## <td>0.563 (0.285)</td>
## <td>0.296 (0.323)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0126 [0.00000000105, 0.887]</td>
## <td class='lastrow'>0.00538 [0.00000000000240, 0.644]</td>
## <td class='lastrow'>0.574 [0.0590, 0.971]</td>
## <td class='lastrow'>0.172 [0.00000000000240, 0.971]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: pielou_evenness  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.848 (0.0410)</td>
## <td>0.815 (0.0685)</td>
## <td>0.908 (0.0161)</td>
## <td>0.865 (0.0548)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.857 [0.698, 0.913]</td>
## <td class='lastrow'>0.829 [0.669, 0.911]</td>
## <td class='lastrow'>0.911 [0.878, 0.934]</td>
## <td class='lastrow'>0.880 [0.669, 0.934]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.167 (0.246)</td>
## <td>0.139 (0.265)</td>
## <td>0.526 (0.248)</td>
## <td>0.302 (0.307)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.0401 [0.0000000000000107, 0.844]</td>
## <td class='lastrow'>0.00146 [0, 0.914]</td>
## <td class='lastrow'>0.552 [0.129, 0.961]</td>
## <td class='lastrow'>0.266 [0, 0.961]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: shannon_entropy  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>5.78 (0.708)</td>
## <td>5.14 (0.828)</td>
## <td>6.83 (0.307)</td>
## <td>6.07 (0.895)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>5.86 [4.02, 7.04]</td>
## <td class='lastrow'>5.23 [3.92, 6.41]</td>
## <td class='lastrow'>6.81 [6.14, 7.39]</td>
## <td class='lastrow'>6.25 [3.92, 7.39]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.0784 (0.166)</td>
## <td>0.0222 (0.0594)</td>
## <td>0.487 (0.323)</td>
## <td>0.227 (0.310)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.00180 [0.00000000000000511, 0.749]</td>
## <td class='lastrow'>0.00000460 [0.00000000000000111, 0.197]</td>
## <td class='lastrow'>0.465 [0.0257, 0.987]</td>
## <td class='lastrow'>0.0447 [0.00000000000000111, 0.987]</td>
## </tr>
## </tbody>
## </table>
## <table class="Rtable1"><caption>Results for alpha metric: simpson  </caption>
## 
## <thead>
## <tr>
## <th class='rowlabel firstrow lastrow'></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_post<br><span class='stratn'>(N=38)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Cdif_pre<br><span class='stratn'>(N=18)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>healthy_donors<br><span class='stratn'>(N=36)</span></span></th>
## <th class='firstrow lastrow'><span class='stratlabel'>Overall<br><span class='stratn'>(N=92)</span></span></th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td class='rowlabel firstrow'>metric</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.969 (0.0175)</td>
## <td>0.951 (0.0285)</td>
## <td>0.987 (0.00307)</td>
## <td>0.973 (0.0216)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.975 [0.902, 0.990]</td>
## <td class='lastrow'>0.959 [0.892, 0.986]</td>
## <td class='lastrow'>0.988 [0.979, 0.992]</td>
## <td class='lastrow'>0.982 [0.892, 0.992]</td>
## </tr>
## <tr>
## <td class='rowlabel firstrow'>t_probability</td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## <td class='firstrow'></td>
## </tr>
## <tr>
## <td class='rowlabel'>Mean (SD)</td>
## <td>0.135 (0.219)</td>
## <td>0.0987 (0.237)</td>
## <td>0.539 (0.231)</td>
## <td>0.286 (0.304)</td>
## </tr>
## <tr>
## <td class='rowlabel lastrow'>Median [Min, Max]</td>
## <td class='lastrow'>0.00753 [0, 0.759]</td>
## <td class='lastrow'>0.0000000179 [0, 0.877]</td>
## <td class='lastrow'>0.597 [0.0989, 0.993]</td>
## <td class='lastrow'>0.165 [0, 0.993]</td>
## </tr>
## </tbody>
## </table>
# for (i in 1:length(metric)){
#   table1(~ metric + t_probability | condition, data=table_list[[i]], caption = paste("Results for alpha metric:", metric[i], " "))
# }

Compare Hospital’s data to AGP healthy data and CDI dataset from BioProject:

all_data_comparison <- rbind.fill(all_healthy, CDI, hospital_donor, hospital_CDI)
  
all_data_comparison <- dplyr::select(all_data_comparison, shannon_entropy, chao1, menhinick, margalef, fisher_alpha, simpson, gini_index, strong, pielou_evenness, faith_pd, condition)

all_data_comparison$condition[all_data_comparison$condition=="healthy"] <- "healthy_AGP"
all_data_comparison$condition[all_data_comparison$condition=="CDI"] <- "CDI_BioProject"
all_data_comparison$condition[all_data_comparison$condition=="Cdif_post"] <- "hospital_CDI_post"
all_data_comparison$condition[all_data_comparison$condition=="Cdif_pre"] <- "hospital_CDI_pre"

violin_hospital_compare <- vector('list', length(metric))

violin_hospital_compare <- plot_violin(all_data_comparison, "condition")


grid.arrange(violin_hospital_compare[[1]], violin_hospital_compare[[2]], violin_hospital_compare[[3]], violin_hospital_compare[[4]], ncol=2) 

grid.arrange(violin_hospital_compare[[5]], violin_hospital_compare[[6]], violin_hospital_compare[[7]], violin_hospital_compare[[8]], ncol=2) 

grid.arrange(violin_hospital_compare[[9]], violin_hospital_compare[[10]], ncol=2) 

Taxonomy

From tutorial: https://uw-madison-microbiome-hub.github.io/Microbiome_analysis_in-_R/ and: https://rpubs.com/mohsen/lefse_analysis_cleaned_prevalence_phyloseq

CDI samples - before vs after FMT

#BiocManager::install("phyloseq")
#remotes::install_github("jbisanz/qiime2R")
#BiocManager::install("microbiome")
#remotes::install_github("microsud/microbiomeutilities")
library(phyloseq)
library(qiime2R)
library(microbiome)
## 
## microbiome R package (microbiome.github.com)
##     
## 
## 
##  Copyright (C) 2011-2021 Leo Lahti, 
##     Sudarshan Shetty et al. <microbiome.github.io>
## 
## Attaching package: 'microbiome'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following object is masked from 'package:base':
## 
##     transform
library(microbiomeutilities)
## Warning: replacing previous import 'ggplot2::alpha' by 'microbiome::alpha' when
## loading 'microbiomeutilities'
# Importing ASVs abundance file
ASVs <- read_qza(here("00_raw_data/09_hospital_data_qiime2","table-CDI.qza"))

# Importing metadata
metadata <- read.delim(here("00_raw_data/07_hospital_CDI", "sample-metadata.tsv.gz"))
rownames(metadata) <- metadata[,1]
metadata$time <- as.numeric(metadata$time)

# Importing tree
tree <- read_qza(here("00_raw_data/09_hospital_data_qiime2","rooted-tree-CDI.qza"))
# Importing taxonomy
taxonomy <- read_qza(here("00_raw_data/09_hospital_data_qiime2","taxonomy-CDI.qza"))

taxonomy_table <- parse_taxonomy(taxonomy$data)

tax_table <- do.call(rbind, strsplit(as.character(taxonomy$data$Taxon), "; "))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
colnames(tax_table) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
rownames(tax_table) <- taxonomy$data$Feature.ID

# Creating phyloseq object
physeq <- phyloseq(
  otu_table(ASVs$data, taxa_are_rows = TRUE),
  phy_tree(tree$data),
  tax_table(tax_table),
  sample_data(metadata)
)

#Subset the data to keep only Receptor samples.
physeq.rec <- subset_samples(physeq, sample.purpose == "Receptor")

for(i in 1:nsamples(physeq.rec)) {
  if(sample_data(physeq.rec)$time[i] <= 0){
    sample_data(physeq.rec)$FMT_pre_post[i] <- "pre"
  } else {
    sample_data(physeq.rec)$FMT_pre_post[i] <- "post"
  }
}

sample_data(physeq.rec)$FMT_pre_post <- as.factor(sample_data(physeq.rec)$FMT_pre_post)
sample_data(physeq.rec)$FMT_pre_post <- relevel(sample_data(physeq.rec)$FMT_pre_post, "pre")
levels(sample_data(physeq.rec)$FMT_pre_post)
## [1] "pre"  "post"
#BiocManager::install("ALDEx2")
#library(ALDEx2)

#aldex2_da <- ALDEx2::aldex(data.frame(phyloseq::otu_table(physeq.rec)), phyloseq::sample_data(physeq.rec)$FMT_pre_post, test="t", effect = TRUE, denom="iqlr")

## Vulcano plot for differentially expressed OTUs
#ALDEx2::aldex.plot(aldex2_da, type="MW", test="wilcox", called.cex = 1, cutoff = 0.1)
ntaxa(physeq.rec)
## [1] 16722
nsamples(physeq.rec)
## [1] 112
otu_table(physeq.rec)[1:5, 1:5]  
## OTU Table:          [5 taxa and 5 samples]
##                      taxa are rows
##                                  CD003-M-1 CD003-M-n CD003-M2 CD003-M4
## 7841644759c6fc1e7b024272b6dcf462         0         0        0        0
## 555cdf14776844cf94aeb170e12170de         0         6        0        0
## fc885b5c51d9581397c9018fe2cc85d3         0         0        0        0
## ee00598097dd659ddea3f91273ef97f1         3         0        0        0
## f2161a1901a6fb0ba3b90ce278930c8b         0         0        0        0
##                                  CD003-M90
## 7841644759c6fc1e7b024272b6dcf462         0
## 555cdf14776844cf94aeb170e12170de         0
## fc885b5c51d9581397c9018fe2cc85d3         0
## ee00598097dd659ddea3f91273ef97f1         0
## f2161a1901a6fb0ba3b90ce278930c8b         0
tax_table(physeq.rec)[1:5, 1:4]
## Taxonomy Table:     [5 taxa by 4 taxonomic ranks]:
##                                  Kingdom       Phylum        Class        
## 7841644759c6fc1e7b024272b6dcf462 "k__Bacteria" "k__Bacteria" "k__Bacteria"
## 555cdf14776844cf94aeb170e12170de "k__Bacteria" "p__OD1"      "c__"        
## fc885b5c51d9581397c9018fe2cc85d3 "k__Bacteria" "p__OD1"      "c__"        
## ee00598097dd659ddea3f91273ef97f1 "k__Bacteria" "k__Bacteria" "k__Bacteria"
## f2161a1901a6fb0ba3b90ce278930c8b "k__Bacteria" "k__Bacteria" "k__Bacteria"
##                                  Order        
## 7841644759c6fc1e7b024272b6dcf462 "k__Bacteria"
## 555cdf14776844cf94aeb170e12170de "o__"        
## fc885b5c51d9581397c9018fe2cc85d3 "o__"        
## ee00598097dd659ddea3f91273ef97f1 "k__Bacteria"
## f2161a1901a6fb0ba3b90ce278930c8b "k__Bacteria"
# Prune and rarefy samples

# Keep samples with non-zero total taxa
pruned <- prune_samples(sample_sums(physeq.rec) > 0, physeq.rec)

# Check total taxa in each sample again
sample_sums(pruned)
##   CD003-M-1   CD003-M-n    CD003-M2    CD003-M4   CD003-M90   CD005-M-1 
##       23127       66753       11518       15114        7832         691 
##    CD005-M2    CD005-M7   CD005-M90    CD006-M2    CD006-M7   CD006-M90 
##       18367       11422       16151       54268       13651       12670 
##   CD007-M-1    CD007-M2   CD007-M90   CD008-M-1   CD008-M-n    CD008-M2 
##       18494       13948       18290       13488       15800       22539 
##    CD008-M7   CD008-M90   CD010-M-n    CD010-M2    CD010-M7   CD011-M-1 
##       16727       18092       21150       16548       23071        8754 
##   CD011-M-n    CD011-M2    CD011-M7   CD011-M90   CD012-M-n    CD012-M2 
##       16169       11663       15454       12021        9787        8619 
##    CD012-M7   CD012-M90   CD013-M-1   CD013-M-n    CD013-M2    CD013-M7 
##       12514       15027        9787       19680       17429       18001 
##   CD014-M-n  CD014-M2-a  CD014-M2-b    CD014-M7   CD015-M-n    CD015-M2 
##       18408       28218       25480       12155        7475        9797 
##    CD015-M7   CD015-M90   CD016-M-1   CD016-M-n    CD016-M2    CD016-M7 
##       32375       21725       13175       14429       11145       12135 
##   CD017-M-1   CD017-M-n    CD017-M2    CD017-M7   CD018-M-1 CD018-M-n-a 
##       13110       15580       12787       20847        8630       25783 
## CD018-M-n-b   CD018-M11   CD019-M-1   CD019-M-n    CD019-M2    CD019-M7 
##       22231       20305        8715       11447       14326       13832 
## CD020-M-n-a CD020-M-n-b    CD020-M2    CD020-M7    CD021M-n   CD023-M-1 
##       15385        1214       23699        9689       17428        4624 
##   CD023-M-n    CD023-M2    CD023-M7   CD025-M-1    CD025-M2   CD026-M-1 
##       13184       10007       15449        7030        7579       10045 
##   CD026-M-n  CD026-M2-a  CD026-M2-b  CD026-M7-a  CD026-M7-b   CD026-M90 
##        6465       13894       17400       10844       12575       15050 
##   CD028-M-1   CD028-M-n  CD028-M2-b    CD028-M7   CD028-M90   CD029-M-n 
##        7107       16203       16228       14942       19831       12906 
##    CD029-M2    CD029-M7   CD029-M90   CD030-M-n    CD030-M2 CD030-M30-n 
##       16682       19111       11080       14222        6816       11712 
##    CD030-M7    CD031-M2    CD031-M7    CD031M-1   CD033-M-1    CD033-M5 
##       10909       22764       20418       19353        4504       17612 
##    CD033-M7   CD033-M90   CD034-M-1   CD034-M-n    CD034-M2    CD034-M7 
##       17815       46321       11816       15245       21461       13800 
##   CD035-M-n    CD035-M2    CD035-M7   CD036-M-1   CD036-M-n    CD036-M2 
##       22023       31698       24964       13609       10941        5731 
##    CD036-M7 CD036-M90-a CD036-M90-b 
##        7356        5593        2767
#rarefy samples
physeq_rarefy <- rarefy_even_depth(pruned, rngseed=1, sample.size=0.9*min(sample_sums(pruned)), replace=F)
## `set.seed(1)` was used to initialize repeatable random subsampling.
## Please record this for your records so others can reproduce.
## Try `set.seed(1); .Random.seed` for the full vector
## ...
## 13799OTUs were removed because they are no longer 
## present in any sample after random subsampling
## ...
#Check the taxa prevalence at Phylum level
#plot_taxa_prevalence(physeq_rarefy, "Order")

Barplots are a one way of visualising the composition of your samples. At Family level and relative abundance

# convert to relative abundance  
physeq.rec.rel <- microbiome::transform(physeq.rec, "compositional")

physeq.rec.rel2 <- prune_taxa(taxa_sums(physeq.rec.rel) > 0, physeq.rec.rel)

Check for the core ASVs

core.taxa.standard <- core_members(physeq.rec.rel2, detection = 0.001, prevalence = 50/100)
print(core.taxa.standard)
## [1] "4f3d767e0404daed3f01328053b70ae8"

we only see IDs, not very informative. We can get the classification of these as below.

#install.packages('DT')
library(DT)
# Extract the taxonomy table
taxonomy_core <- as.data.frame(tax_table(physeq.rec.rel2))

# Subset this taxonomy table to include only core OTUs
core_taxa_id <- subset(taxonomy_core, rownames(taxonomy_core) %in% core.taxa.standard)

DT::datatable(core_taxa_id)
physeq.fam.rel <- physeq.rec %>%
  aggregate_rare(level = "Family", detection = 1/100, prevalence = 75/100) %>%
  microbiome::transform(transform = "compositional")

plot_composition(physeq.fam.rel, sample.sort = "FMT_pre_post", x.label = "FMT_pre_post") + 
  theme(legend.position = "bottom") + 
  scale_fill_brewer("Family", palette = "Paired") + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90, size = 10)) + 
  ggtitle("Relative abundance") + 
  theme(legend.title = element_text(size = 18))

library(RColorBrewer)

physeq.fam.rel <- physeq.rec %>%
  aggregate_rare(level = "Family", detection = 1/100, prevalence = 50/100) %>%
  microbiome::transform(transform = "compositional") 

colourCount = ntaxa(tax_table(physeq.fam.rel))
getPalette = colorRampPalette(brewer.pal(13, "Paired"))
## Warning in brewer.pal(13, "Paired"): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
plot_composition(physeq.fam.rel,
                 average_by = "FMT_pre_post", 
                 transform = "compositional") + 
  scale_fill_manual(values = getPalette(colourCount)) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90, size = 10)) + 
  ggtitle("Relative abundance") + 
  theme(legend.title = element_text(size = 18))

phyloseq::plot_bar(physeq.fam.rel, fill = "Family") +
  geom_bar(aes(fill = Family), stat = "identity", position = "stack") +
  labs(x = "", y = "Relative Abundance\n") +
  facet_wrap(~ FMT_pre_post, scales = "free", nrow =2) +
  theme(panel.background = element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  scale_fill_manual(values = getPalette(colourCount))

# install.packages(
#   "microViz",
#   repos = c(davidbarnett = "https://david-barnett.r-universe.dev", getOption("repos"))
# )
#library(microViz)
# 
# physeq.rec <- physeq.rec %>%
#  tax_fix(
#   min_length = 4,
#   unknowns = c("k__Bacteria", "p__Firmicutes", "p__Proteobacteria", "p__OD1 Phylum", "p__OD1"),
#   sep = " ", anon_unique = TRUE,
#   suffix_rank = "classified"
#  )
# 
# 
# plot_family <- comp_barplot(
#   physeq.rec.rel2,
#   "Family",
#   n_taxa = 4,
#   tax_order = sum,
#   merge_other = TRUE,
#   sample_order = "bray",
#   order_with_all_taxa = FALSE,
#   label = "SAMPLE",
#   facet_by = "condition",
#   bar_width = 1,
#   bar_outline_colour = NA,
#   #palette = distinct_palette(n_taxa),
#   tax_transform_for_ordering = "identity",
#   tax_transform_for_plot = "compositional",
#   #seriate_method = "OLO_ward",
#   keep_all_vars = TRUE,
#   #interactive = FALSE,
#   x = "SAMPLE"
# )
# 
# plot_family
mycols <- c("coral", "steelblue2")

physeq.fam <- aggregate_taxa(physeq.rec, "Family")
top_f <- top_taxa(physeq.fam, 9)
top_f
## [1] "f__Enterobacteriaceae"  "f__Lachnospiraceae"     "f__Verrucomicrobiaceae"
## [4] "f__Ruminococcaceae"     "f__Veillonellaceae"     "f__Bacteroidaceae"     
## [7] "f__Enterococcaceae"     "f__Lactobacillaceae"    "f__Streptococcaceae"
top_fams <- plot_listed_taxa(physeq.fam, top_f, 
                 group= "FMT_pre_post",
                 group.colors = mycols,
                 add.violin = F,
                 dot.opacity = 0.25,
                 box.opacity = 0.25,
                 panel.arrange= "wrap")
## An additonal column Sam_rep with sample names is created for reference purpose
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the microbiomeutilities package.
##   Please report the issue at
##   <https://github.com/microsud/microbiomeutilities/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
top_fams

dom.tax <- dominant_taxa(physeq.rec,level = "Family", group="FMT_pre_post")
dom.tax_pre <- dom.tax$dominant_overview %>% filter(FMT_pre_post == "pre")
dom.tax_post <- dom.tax$dominant_overview %>% filter(FMT_pre_post == "post")

head(dom.tax$dominant_overview)
## # A tibble: 6 × 5
## # Groups:   FMT_pre_post [2]
##   FMT_pre_post dominant_taxa              n rel.freq rel.freq.pct
##   <fct>        <chr>                  <int>    <dbl> <chr>       
## 1 post         f__Enterobacteriaceae     13     18.6 19%         
## 2 post         f__Lachnospiraceae        13     18.6 19%         
## 3 post         f__Verrucomicrobiaceae    11     15.7 16%         
## 4 post         f__Ruminococcaceae        10     14.3 14%         
## 5 post         f__Bacteroidaceae          9     12.9 13%         
## 6 pre          f__Verrucomicrobiaceae     7     16.7 17%
head(dom.tax_pre, 10) %>%
  flextable()

FMT_pre_post

dominant_taxa

n

rel.freq

rel.freq.pct

pre

f__Verrucomicrobiaceae

7

16.7

17%

pre

f__Enterobacteriaceae

6

14.3

14%

pre

f__Enterococcaceae

6

14.3

14%

pre

f__Veillonellaceae

6

14.3

14%

pre

f__Bacteroidaceae

5

11.9

12%

pre

f__Lactobacillaceae

5

11.9

12%

pre

f__Ruminococcaceae

3

7.1

7%

pre

f__Lachnospiraceae

2

4.8

5%

pre

f__Fusobacteriaceae

1

2.4

2%

pre

f__Porphyromonadaceae

1

2.4

2%

head(dom.tax_post, 10)%>%
  flextable()

FMT_pre_post

dominant_taxa

n

rel.freq

rel.freq.pct

post

f__Enterobacteriaceae

13

18.6

19%

post

f__Lachnospiraceae

13

18.6

19%

post

f__Verrucomicrobiaceae

11

15.7

16%

post

f__Ruminococcaceae

10

14.3

14%

post

f__Bacteroidaceae

9

12.9

13%

post

f__Enterococcaceae

4

5.7

6%

post

f__Lactobacillaceae

4

5.7

6%

post

f__Veillonellaceae

2

2.9

3%

post

f__Bifidobacteriaceae

1

1.4

1%

post

f__Desulfovibrionaceae

1

1.4

1%

grp_abund <- get_group_abundances(physeq.rec, 
                                  level = "Family", 
                                  group="FMT_pre_post",
                                  transform = "compositional")
## An additonal column Sam_rep with sample names is created for reference purpose
# clean names 
grp_abund$OTUID <- gsub("f__", "",grp_abund$OTUID)
grp_abund$OTUID <- ifelse(grp_abund$OTUID == "", 
                          "Unclassified", grp_abund$OTUID)

grp_abund_sorted <- grp_abund[order(grp_abund$mean_abundance, decreasing=TRUE),]


mean.plot_sorted <- grp_abund_sorted[1:40,] %>% # input data
  ggplot(aes(x= reorder(OTUID, mean_abundance), # reroder based on mean abundance
             y= mean_abundance,
             fill=FMT_pre_post)) + 
  geom_bar(stat = "identity", 
          position = position_dodge()) + 
  scale_fill_manual("FMT_pre_post", values=mycols) + # manually specify colors
  theme_bw() +
  ylab("Mean Relative Abundance") + 
  xlab("Family") + 
  coord_flip() 
# +
#   scale_fill_manual(values = getPalette(3)) 

mean.plot_sorted

Tree plot

physeq_top_50 <- subset_taxa(physeq.rec, Kingdom=="k__Bacteria")
physeq_top_50 <- prune_taxa(names(sort(taxa_sums(physeq_top_50),TRUE)[1:50]), physeq_top_50)

# Color the nodes by category
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="FMT_pre_post")

# Convert to radial tree
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="FMT_pre_post") + coord_polar(theta="y")

Lefse to test for differential abundance between categories

from: https://rpubs.com/mohsen/lefse_analysis_cleaned_prevalence_phyloseq

# Install microbiome marker package
#remotes::install_version('ggplot2', version='3.3.6')
#BiocManager::install("ggtree")
#BiocManager::install("microbiomeMarker")

# activate microbiome marker package 
library(microbiomeMarker)
## Registered S3 method overwritten by 'ggtree':
##   method      from 
##   identify.gg ggfun
## 
## Attaching package: 'microbiomeMarker'
## The following objects are masked from 'package:microbiome':
## 
##     abundances, aggregate_taxa
## The following object is masked from 'package:qiime2R':
## 
##     summarize_taxa
## The following object is masked from 'package:phyloseq':
## 
##     plot_heatmap
# Differential abundance using lefse
lefse <- run_lefse(physeq.rec,
                   group = "FMT_pre_post",
                   taxa_rank = "Family",
                   wilcoxon_cutoff = 0.05,
                   norm = "CPM",
                   kw_cutoff = 0.05,
                   multigrp_strat = TRUE,
                   lda_cutoff = 2
)
## Warning in preprocess_ps(ps): The library size of sample(s): CD028-M2-a is/are
## zero, and will be removed in the subsequent analysis.
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
#head(marker_table(lefse))

# bar plot
plot_ef_bar(lefse)

# dot plot
plot_ef_dot(lefse)

library(knitr)
dat <- marker_table(lefse) %>% data.frame() %>% select(1:4)

dat %>% flextable()

feature

enrich_group

ef_lda

pvalue

f__Peptostreptococcaceae

pre

3.751640

0.000065467363

o__Clostridiales_k__Bacteria

pre

3.493069

0.012386655594

f__[Paraprevotellaceae]

pre

2.852463

0.025074627500

f__Leptotrichiaceae

pre

2.693686

0.016870489180

f__Ruminococcaceae

post

4.767258

0.001687499285

f__Lachnospiraceae

post

4.611532

0.011256072336

f__Rikenellaceae

post

4.544018

0.000002408661

f__Bifidobacteriaceae

post

4.104626

0.007168055327

f__Erysipelotrichaceae

post

4.079664

0.024810140922

f__[Barnesiellaceae]

post

3.808157

0.010962494793

o__Clostridiales_f__

post

3.778734

0.008928572756

f__Clostridiaceae

post

3.738037

0.008496590045

f__[Odoribacteraceae]

post

3.585347

0.004615594153

f__Christensenellaceae

post

3.174403

0.002367646033

f__[Mogibacteriaceae]

post

3.107154

0.000823396201

Lefse with lefser package

How to transform data in SE format: https://microbiome.github.io/OMA/data-introduction.html

# #BiocManager::install("lefser")
# library(lefser)
# #BiocManager::install("microbiome/mia", version='3.14')
# library(mia)
# 
# counts  <- ASVs$data   # Abundance table (e.g. ASV data; to assay data)
# tax     <- tax_table     # Taxonomy table (to rowData)
# samples <-  metadata  # Sample data (to colData)
# se <- SummarizedExperiment(assays = list(counts = counts),
#                            colData = samples,
#                            rowData = tax)
# 
# # Goes through the whole DataFrame. Removes '.*[kpcofg]__' from strings, where [kpcofg] 
# # is any character from listed ones, and .* any character.
# rowdata_modified <- BiocParallel::bplapply(rowData(se), 
#                                            FUN = stringr::str_remove, 
#                                            pattern = '.*[kpcofg]__')
# 
# # Genus level has additional '\"', so let's delete that also
# rowdata_modified <- BiocParallel::bplapply(rowdata_modified, 
#                                            FUN = stringr::str_remove, 
#                                            pattern = '\"')
# 
# # rowdata_modified is a list, so it is converted back to DataFrame format. 
# rowdata_modified <- DataFrame(rowdata_modified)
# 
# # And then assigned back to the SE object
# rowData(se) <- rowdata_modified
# 
# # Now we have a nicer table
# head(rowData(se))
# 
# head(colData(se))
# 
# #add relative abundances along the original count data 
# se <- relAbundanceCounts(se)
# assays(se)
# 
# assays(se)$counts[1:5,1:7]
# assay(se, "relabundance")[1:5,1:7]
# 
# dim(se)
# 
# #add phylogenetic tree
# tse <- as(se, "TreeSummarizedExperiment")
# # Add tree to rowTree
# rowTree(tse) <- tree$data
# # Check
# tse
# 
# se <- tse
# 
# #head(rowTree(tse))
# rowLinks(tse)
# 
# # subset by sample purpose
# se_subset_by_sample <- se[ , se$sample.purpose %in% c("Receptor")]
# 
# # show dimensions
# dim(se_subset_by_sample)
# 
# head(colData(se_subset_by_sample))
# 
# # add metadata column
# 
# for(i in 1:dim(se_subset_by_sample)[2]) {
#   if(se_subset_by_sample$time[i] <= 0){
#     se_subset_by_sample$FMT_pre_post[i] <- "pre"
#   } else {
#     se_subset_by_sample$FMT_pre_post[i] <- "post"
#   }
# }
# 
# 
# # subset by sample and feature and get rid of NAs
# se_subset_by_sample_feature <- se[rowData(se)$Phylum %in% c("Bacteria") & !is.na(rowData(se)$Phylum), se$sample.purpose %in% c("Receptor")]
# 
# # show dimensions
# dim(se_subset_by_sample_feature)
# table(se_subset_by_sample$FMT_pre_post)
# res <- lefser(se_subset_by_sample, groupCol = "FMT_pre_post")
# head(res)
# lefserPlot(res)

Donor data vs CDI data

# Importing ASVs abundance file
ASVs_merged <- read_qza(here("00_raw_data/09_hospital_data_qiime2","merged-table.qza"))

# Importing metadata
metadata_CDI <- read.delim(here("00_raw_data/07_hospital_CDI", "sample-metadata.tsv.gz"))
metadata_CDI$time <- as.numeric(metadata_CDI$time)
names(metadata_CDI)[names(metadata_CDI) == 'Sample.ID'] <- 'SampleID'
names(metadata_CDI)[names(metadata_CDI) == 'sample.origin'] <- 'condition'


for(i in 1:nrow(metadata_CDI)) {
  if(metadata_CDI$time[i] <= 0 & !is.na(metadata_CDI$time[i])){
    metadata_CDI$condition[i] <- paste(metadata_CDI$condition[i], "pre", sep="_")
  } else if (metadata_CDI$time[i] > 0 & !is.na(metadata_CDI$time[i])) {
    metadata_CDI$condition[i] <- paste(metadata_CDI$condition[i], "post", sep="_")
  }
}

metadata_donor <- read.delim(here("00_raw_data/08_hospital_donor","sample-metadata.tsv.gz"))
metadata_donor$condition <- "healthy_donors"
names(metadata_donor)[names(metadata_donor) == 'sample_id'] <- 'SampleID'


metadata_merged <- rbind.fill(metadata_CDI, metadata_donor)
metadata_merged <- dplyr::select(metadata_merged, SampleID, condition)
rownames(metadata_merged) <- metadata_merged[,1]


# Importing tree
tree_merged <- read_qza(here("00_raw_data/09_hospital_data_qiime2","rooted-tree-merged.qza"))

# Importing taxonomy
taxonomy_merged <- read_qza(here("00_raw_data/09_hospital_data_qiime2","taxonomy-merged.qza"))
taxonomy_table_merged <- parse_taxonomy(taxonomy_merged$data)
tax_table_merged <- do.call(rbind, strsplit(as.character(taxonomy_merged$data$Taxon), "; "))
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)
colnames(tax_table_merged) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
rownames(tax_table_merged) <- taxonomy_merged$data$Feature.ID

# Creating phyloseq object
physeq_merged <- phyloseq(
  otu_table(ASVs_merged$data, taxa_are_rows = TRUE),
  phy_tree(tree_merged$data),
  tax_table(tax_table_merged),
  sample_data(metadata_merged)
)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
#Subset the data to keep only Receptor samples.
physeq_merged <- subset_samples(physeq_merged, condition == c("Cdif_pre", "Cdif_post", "healthy_donors"))
## Warning in condition == c("Cdif_pre", "Cdif_post", "healthy_donors"): longer
## object length is not a multiple of shorter object length
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
sample_data(physeq_merged)$condition <- as.factor(sample_data(physeq_merged)$condition)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
sample_data(physeq_merged)$condition <- relevel(sample_data(physeq_merged)$condition, "Cdif_pre")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
levels(sample_data(physeq_merged)$condition)
## [1] "Cdif_pre"       "Cdif_post"      "healthy_donors"

Barplots :

# convert to relative abundance  
physeq.rel <- microbiome::transform(physeq_merged, "compositional")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
physeq.rel2 <- prune_taxa(taxa_sums(physeq.rel) > 0, physeq.rel)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
physeq.fam.rel <- physeq_merged %>%
  aggregate_rare(level = "Family", detection = 1/100, prevalence = 75/100) %>%
  microbiome::transform(transform = "compositional")

plot_composition(physeq.fam.rel, sample.sort = "condition", x.label = "condition") + 
  theme(legend.position = "bottom") + 
  scale_fill_brewer("Family", palette = "Paired") + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90, size = 10)) + 
  ggtitle("Relative abundance") + 
  theme(legend.title = element_text(size = 18))

 library(RColorBrewer)

physeq.fam.rel <- physeq_merged %>%
  aggregate_rare(level = "Family", detection = 1/100, prevalence = 50/100) %>%
  microbiome::transform(transform = "compositional") 

colourCount = ntaxa(tax_table(physeq.fam.rel))
getPalette = colorRampPalette(brewer.pal(13, "Paired"))
## Warning in brewer.pal(13, "Paired"): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
plot_composition(physeq.fam.rel,
                 average_by = "condition", 
                 transform = "compositional") + 
  scale_fill_manual(values = getPalette(colourCount)) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90, size = 10)) + 
  ggtitle("Relative abundance") + 
  theme(legend.title = element_text(size = 18))

mycols <- c("coral", "steelblue2", "darkolivegreen2")

physeq.fam <- aggregate_taxa(physeq_merged, "Family")

sample_data(physeq.fam)$condition <- as.factor(str_replace(sample_data(physeq.fam)$condition, "Cdif_pre", "Pre"))
sample_data(physeq.fam)$condition <- as.factor(str_replace(sample_data(physeq.fam)$condition, "Cdif_post", "Post"))
sample_data(physeq.fam)$condition <- as.factor(str_replace(sample_data(physeq.fam)$condition, "healthy_donors", "Donors"))

sample_data(physeq.fam)$condition <- relevel(sample_data(physeq.fam)$condition, ref = "Post")
sample_data(physeq.fam)$condition <- relevel(sample_data(physeq.fam)$condition, ref = "Pre")

top_f <- top_taxa(physeq.fam, 9)
top_f
## [1] "f__Bacteroidaceae"      "f__Ruminococcaceae"     "f__Lachnospiraceae"    
## [4] "f__Veillonellaceae"     "f__Enterobacteriaceae"  "f__Rikenellaceae"      
## [7] "f__Verrucomicrobiaceae" "f__Porphyromonadaceae"  "f__Prevotellaceae"
top_fams <- plot_listed_taxa(physeq.fam,
                             top_f,
                             group= "condition",
                             group.colors = mycols,
                             add.violin = F,
                             dot.opacity = 0.25,
                             box.opacity = 0.25,
                             panel.arrange= "wrap")
## An additonal column Sam_rep with sample names is created for reference purpose
top_fams

dom.tax <- dominant_taxa(physeq_merged, level = "Family", group="condition")
dom.tax_pre <- dom.tax$dominant_overview %>% filter(condition == "Cdif_pre")
dom.tax_post <- dom.tax$dominant_overview %>% filter(condition == "Cdif_post")
dom.tax_donor <- dom.tax$dominant_overview %>% filter(condition == "healthy_donors")

head(dom.tax_pre) %>%
  flextable()

condition

dominant_taxa

n

rel.freq

rel.freq.pct

Cdif_pre

f__Bacteroidaceae

3

17.6

18%

Cdif_pre

f__Enterococcaceae

3

17.6

18%

Cdif_pre

f__Veillonellaceae

3

17.6

18%

Cdif_pre

f__Enterobacteriaceae

2

11.8

12%

Cdif_pre

f__Ruminococcaceae

2

11.8

12%

Cdif_pre

f__Verrucomicrobiaceae

2

11.8

12%

head(dom.tax_post)%>%
  flextable()

condition

dominant_taxa

n

rel.freq

rel.freq.pct

Cdif_post

f__Enterobacteriaceae

5

21.7

22%

Cdif_post

f__Lachnospiraceae

5

21.7

22%

Cdif_post

f__Bacteroidaceae

3

13.0

13%

Cdif_post

f__Ruminococcaceae

3

13.0

13%

Cdif_post

f__Verrucomicrobiaceae

3

13.0

13%

Cdif_post

f__Lactobacillaceae

2

8.7

9%

head(dom.tax_donor)%>%
  flextable()

condition

dominant_taxa

n

rel.freq

rel.freq.pct

healthy_donors

f__Bacteroidaceae

28

71.8

72%

healthy_donors

f__Ruminococcaceae

5

12.8

13%

healthy_donors

f__Lachnospiraceae

3

7.7

8%

healthy_donors

f__Prevotellaceae

3

7.7

8%

grp_abund <- get_group_abundances(physeq_merged, 
                                  level = "Family", 
                                  group="condition",
                                  transform = "compositional")
## An additonal column Sam_rep with sample names is created for reference purpose
# clean names 
grp_abund$OTUID <- gsub("f__", "",grp_abund$OTUID)
grp_abund$OTUID <- ifelse(grp_abund$OTUID == "", 
                          "Unclassified", grp_abund$OTUID)

grp_abund_sorted <- grp_abund[order(grp_abund$mean_abundance, decreasing=TRUE),]


mean.plot_sorted <- grp_abund_sorted[1:40,] %>% # input data
  ggplot(aes(x= reorder(OTUID, mean_abundance), # reroder based on mean abundance
             y= mean_abundance,
             fill=condition)) + 
  geom_bar(stat = "identity", 
          position = position_dodge()) + 
  scale_fill_manual("condition", values=mycols) + # manually specify colors
  theme_bw() +
  ylab("Mean Relative Abundance") + 
  xlab("Family") + 
  coord_flip()

mean.plot_sorted

Tree plot

physeq_top_50 <- subset_taxa(physeq_merged, Kingdom=="k__Bacteria")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
physeq_top_50 <- prune_taxa(names(sort(taxa_sums(physeq_top_50),TRUE)[1:50]), physeq_top_50)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
# Color the nodes by category
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="condition")

# Convert to radial tree
plot_tree(physeq_top_50, nodelabf=nodeplotblank, label.tips="Genus", ladderize="left", color="condition") + coord_polar(theta="y")

Lefse to test for differential abundance between categories

from: https://rpubs.com/mohsen/lefse_analysis_cleaned_prevalence_phyloseq

# Differential abundance using lefse
lefse <- run_lefse(physeq_merged,
                   group = "condition",
                   taxa_rank = "Family",
                   wilcoxon_cutoff = 0.05,
                   norm = "CPM",
                   kw_cutoff = 0.05,
                   multigrp_strat = TRUE,
                   lda_cutoff = 2
)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
# bar plot
plot_ef_bar(lefse)

# dot plot
plot_ef_dot(lefse)

library(knitr)
dat <- marker_table(lefse) %>% data.frame() %>% select(1:4)

dat %>% flextable()

feature

enrich_group

ef_lda

pvalue

f__Enterobacteriaceae

Cdif_pre

5.019863

0.00000722865924779

f__Enterococcaceae

Cdif_pre

4.888719

0.00000041798678037

f__Lactobacillaceae

Cdif_pre

4.768221

0.00000000003656169

f__Peptostreptococcaceae

Cdif_pre

3.882746

0.00001126451367478

k__Bacteria_k__Bacteria_k__Bacteria_k__Bacteria

Cdif_pre

3.382887

0.00002234050035835

f__Carnobacteriaceae

Cdif_pre

3.355318

0.00022394112443342

f__Actinomycetaceae

Cdif_pre

3.078969

0.00210243838091560

f__[Tissierellaceae]

Cdif_pre

3.058240

0.01088136973394066

f__Pasteurellaceae

Cdif_pre

3.038437

0.01423156416571151

f__Leptotrichiaceae

Cdif_pre

2.613028

0.02013020468359838

f__Gemellaceae

Cdif_pre

2.582525

0.00342559972775908

f__Corynebacteriaceae

Cdif_pre

2.234964

0.02487689077094919

f__Micrococcaceae

Cdif_pre

2.125729

0.00365354080850151

f__Neisseriaceae

Cdif_pre

2.083472

0.02487689077094919

o__Lactobacillales_k__Bacteria

Cdif_pre

2.044632

0.01327155137579679

f__Streptococcaceae

Cdif_post

4.727100

0.00000023814153347

f__Fusobacteriaceae

Cdif_post

4.043192

0.00033522810850423

f__Clostridiaceae

Cdif_post

3.735624

0.00125214914383243

o__Clostridiales_k__Bacteria

Cdif_post

3.576153

0.03194429025549037

f__Leuconostocaceae

Cdif_post

2.928322

0.00246241049233072

f__[Mogibacteriaceae]

Cdif_post

2.846803

0.01080665000498568

f__Bacteroidaceae

healthy_donors

5.223932

0.00000000262051031

f__Ruminococcaceae

healthy_donors

5.003683

0.00000010823078621

f__Lachnospiraceae

healthy_donors

4.827433

0.00129259456138810

f__Rikenellaceae

healthy_donors

4.389663

0.00001242198127422

f__Porphyromonadaceae

healthy_donors

4.312682

0.00000000108484095

f__[Barnesiellaceae]

healthy_donors

4.028017

0.00000000039221370

f__[Paraprevotellaceae]

healthy_donors

3.977319

0.00001201496291125

o__Clostridiales_f__

healthy_donors

3.955847

0.00099584336046843

f__[Odoribacteraceae]

healthy_donors

3.809392

0.00000044752653400

f__Christensenellaceae

healthy_donors

3.527962

0.00102425156499283

f__S24-7

healthy_donors

3.364287

0.00314413219170438

o__RF32_f__

healthy_donors

3.237709

0.00114169956828998

o__YS2_f__

healthy_donors

3.148024

0.03435315417650129

o__RF39_f__

healthy_donors

2.331961

0.00598693092425966

f__Oxalobacteraceae

healthy_donors

2.196756

0.01885911458068856

f__Peptococcaceae

healthy_donors

2.102568

0.02069257446309545